Raw data

Dataset downloaded from mgandal’s github repository.

Load and annotate data

# Load csvs
datExpr = read.csv('./../../../initialExperiments/FirstYearReview/Data/Gandal/RNAseq_ASD_datExpr.csv', row.names=1)
datMeta = read.csv('./../../../initialExperiments/FirstYearReview/Data/Gandal/RNAseq_ASD_datMeta.csv')

# Group brain regions by lobes
datMeta$Brain_Region = as.factor(datMeta$Region)
datMeta$Brain_lobe = 'Occipital'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA4_6', 'BA9', 'BA24', 'BA44_45')] = 'Frontal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA3_1_2_5', 'BA7')] = 'Parietal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA38', 'BA39_40', 'BA20_37', 'BA41_42_22')] = 'Temporal'
datMeta$Brain_lobe=factor(datMeta$Brain_lobe, levels=c('Frontal', 'Temporal', 'Parietal', 'Occipital'))

# Remove '/' from Batch variable: (It is recommended (but not required) to use only letters, numbers, 
# and delimiters '_' or '.', in levels of factors as these are safe characters for column names in R
datMeta$Batch = gsub('/', '.', datMeta$RNAExtractionBatch) %>% as.factor

# Transform Diagnosis into a factor variable
datMeta$Diagnosis_ = factor(datMeta$Diagnosis_, levels=c('CTL','ASD'))


# GO Neuronal annotations
GO_annotations = read.csv('./../../../initialExperiments/FirstYearReview/Data/GO_annotations/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)


# SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI/SFARI_genes_08-29-2019_with_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]


rm(GO_annotations)

Check sample composition

Data description taken from the dataset’s synapse entry: RNAseq data was generated from 88 postmortem cortex brain samples from subjects with ASD (53 samples from 24 subjects) and non-psychiatric controls (35 samples from 17 subjects), across four cortical regions encompassing all major cortical lobes – frontal, temporal, parietal, and occipital. Brain samples were obtained from the Harvard Brain Bank as part of the Autism Tissue Project (ATP).

print(paste0('Dataset includes ', nrow(datExpr), ' genes from ', ncol(datExpr), ' samples belonging to ', length(unique(datMeta$Subject_ID)), ' different subjects.'))
## [1] "Dataset includes 63682 genes from 88 samples belonging to 41 different subjects."


Filtering criteria previous to level of expression

getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
            'end_position','strand','band','gene_biotype','percentage_gc_content')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
               dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19
datGenes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(datExpr), mart=mart)
datGenes = datGenes[match(rownames(datExpr), datGenes$ensembl_gene_id),]
datGenes$length = datGenes$end_position-datGenes$start_position

rm(getinfo, mart)


# 1. Filter genes with start or end position missing
to_keep = !is.na(datGenes$length)
print(paste0('Names of the \'Genes\' removed: ', paste(rownames(datExpr)[!to_keep], collapse=', ')))
## [1] "Names of the 'Genes' removed: __no_feature, __ambiguous, __too_low_aQual, __not_aligned, __alignment_not_unique"
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
rownames(datGenes) = datGenes$ensembl_gene_id


# 2. Filter genes that do not encode any protein
to_keep = datGenes$gene_biotype=='protein_coding'
datExpr = datExpr %>% filter(to_keep)
datGenes = datGenes %>% filter(to_keep)
rownames(datExpr) = datGenes$ensembl_gene_id
rownames(datGenes) = datGenes$ensembl_gene_id

# 3. Filter genes with low expression levels
# 3.1 Remove genes with zero expression in all of the samples
to_keep = rowSums(datExpr)>0
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]

print(paste0('Filtered dataset includes ', nrow(datExpr), ' genes from ', ncol(datExpr),
             ' samples belonging to ', length(unique(datMeta$Subject_ID)), ' different subjects.'))
## [1] "Filtered dataset includes 19426 genes from 88 samples belonging to 41 different subjects."
print(paste0(length(unique(SFARI_genes$`gene-symbol`[SFARI_genes$ID %in% rownames(datExpr)])), ' SFARI genes remaining'))
## [1] "967 SFARI genes remaining"
# Save datasets before level of expression filtering
datExpr_original = datExpr
datGenes_original = datGenes
datMeta_original = datMeta

Filter criteria by mean level of expression

  • Filtering outlier samples (there aren’t many)

  • Creating DESeq object and normalising using vst transformation

#thresholds = c(0.5, 1, seq(2,10), 12, 15, 17, 20, 30, 40, 50, 60, 70, 80, 100)
thresholds = c(0, 0.1, seq(0.2, 2, 0.2), 2.5, 3, 5, 7.5, 10)

for(threshold in thresholds){
  
  datMeta = datMeta_original
  datExpr = datExpr_original
  datGenes = datGenes_original
  
  cat(paste0('\n\nFiltering with threshold: ', threshold,'\n'))
  to_keep = rowMeans(datExpr)>threshold
  datGenes = datGenes[to_keep,]
  datExpr = datExpr[to_keep,]
  
  # Filter outlier samples
  absadj = datExpr %>% bicor %>% abs
  netsummary = fundamentalNetworkConcepts(absadj)
  ku = netsummary$Connectivity
  z.ku = (ku-mean(ku))/sqrt(var(ku))
  
  to_keep = abs(z.ku)<2
  datMeta = datMeta[to_keep,]
  datExpr = datExpr[,to_keep]
  
  cat(paste0('Removing ', sum(!to_keep), ' samples\n'))
  
  rm(absadj, netsummary, ku, z.ku, to_keep)
  
  
  # Create a DeseqDataSet object, estimate the library size correction and save the normalized counts matrix
  counts = datExpr %>% as.matrix
  rowRanges = GRanges(datGenes$chromosome_name,
                    IRanges(datGenes$start_position, width=datGenes$length),
                    strand=datGenes$strand,
                    feature_id=datGenes$ensembl_gene_id)
  se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
  dds = DESeqDataSet(se, design =~Diagnosis_)
  
  # Perform vst
  vsd = vst(dds)
  
  datExpr_vst = assay(vsd)
  datMeta_vst = colData(vsd)
  datGenes_vst = rowRanges(vsd)
  
  rm(counts, rowRanges, se, vsd)
  
  # Save summary results in dataframe
  if(threshold == thresholds[1]){
    mean_vs_sd_data = data.frame('threshold'=threshold, 'ID'=rownames(datExpr_vst),
                                 'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))
  } else {
    new_entries = data.frame('threshold'=threshold, 'ID'=rownames(datExpr_vst),
                                 'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))
    mean_vs_sd_data = rbind(mean_vs_sd_data, new_entries)
  }
}  
## 
## 
## Filtering with threshold: 0
## alpha: 1.000000
## Removing 7 samples
## 
## 
## Filtering with threshold: 0.1
## alpha: 1.000000
## Removing 6 samples
## 
## 
## Filtering with threshold: 0.2
## alpha: 1.000000
## Removing 6 samples
## 
## 
## Filtering with threshold: 0.4
## alpha: 1.000000
## Removing 6 samples
## 
## 
## Filtering with threshold: 0.6
## alpha: 1.000000
## Removing 6 samples
## 
## 
## Filtering with threshold: 0.8
## alpha: 1.000000
## Removing 6 samples
## 
## 
## Filtering with threshold: 1
## alpha: 1.000000
## Removing 6 samples
## 
## 
## Filtering with threshold: 1.2
## alpha: 1.000000
## Removing 6 samples
## 
## 
## Filtering with threshold: 1.4
## alpha: 1.000000
## Removing 6 samples
## 
## 
## Filtering with threshold: 1.6
## alpha: 1.000000
## Removing 6 samples
## 
## 
## Filtering with threshold: 1.8
## alpha: 1.000000
## Removing 6 samples
## 
## 
## Filtering with threshold: 2
## alpha: 1.000000
## Removing 6 samples
## 
## 
## Filtering with threshold: 2.5
## alpha: 1.000000
## Removing 6 samples
## 
## 
## Filtering with threshold: 3
## alpha: 1.000000
## Removing 6 samples
## 
## 
## Filtering with threshold: 5
## alpha: 1.000000
## Removing 6 samples
## 
## 
## Filtering with threshold: 7.5
## alpha: 1.000000
## Removing 6 samples
## 
## 
## Filtering with threshold: 10
## alpha: 1.000000
## Removing 6 samples
rm(new_entries)

Mean vs SD plot

to_keep_1 = mean_vs_sd_data$ID[mean_vs_sd_data$threshold==thresholds[1] & mean_vs_sd_data$Mean<6] %>%
            as.character
to_keep_2 = mean_vs_sd_data$ID[mean_vs_sd_data$threshold==thresholds[1] & mean_vs_sd_data$Mean>=6]
to_keep_2 = to_keep_2 %>% sample(round(length(to_keep_2)/20)) %>% as.character

plot_data = mean_vs_sd_data[mean_vs_sd_data$ID %in% c(to_keep_1, to_keep_2),]

Note: Plotting all of the genes is too heavy, so I’m going to filter most of the genes with the highest levels of expression because we care about the behaviour of the genes with low levels

The weird behaviour from the left tail disappears at around 1.6

ggplotly(plot_data %>% ggplot(aes(Mean, SD)) + 
         geom_point(color='#0099cc', alpha=0.2, aes(id=ID, frame=threshold)) + 
         scale_x_log10() + scale_y_log10() + theme_minimal())
rm(to_keep_1, to_keep_2, plot_data)

The trend line doesn’t actually flatten out completely with any threshold, it takes really high thresholds to flatten out the left curve of the trend line, but you lose a lot of genes to achieve this. Maybe the trend line is too sensitive and is not a good representation to select the filtering threshold

ggplotly(mean_vs_sd_data %>% ggplot(aes(Mean, SD)) +
              geom_line(stat='smooth', method='loess', se=FALSE, alpha=0.5, 
                        aes(group=threshold, color=threshold)) +
              ylim(min(mean_vs_sd_data$SD), max(mean_vs_sd_data$SD)) +
              ylab('Remaining Genes') + theme_minimal() + 
              ggtitle('Trend lines for different filtering thresholds'))
plot_data = mean_vs_sd_data %>% group_by(threshold) %>% tally

ggplotly(plot_data %>% ggplot(aes(threshold, n)) + geom_point() + geom_line() +
         theme_minimal() + ggtitle('Remaining genes for each filtering threshold'))

Conclusion:

The scatter plot and the trend lines seem to give different results, the first one seems to point to a low threshold as the optimum and the second one at a much higher threshold.



Session info

sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] knitr_1.23                  dendextend_1.12.0          
##  [3] vsn_3.52.0                  WGCNA_1.68                 
##  [5] fastcluster_1.1.25          dynamicTreeCut_1.63-1      
##  [7] sva_3.32.1                  genefilter_1.66.0          
##  [9] mgcv_1.8-31                 nlme_3.1-143               
## [11] DESeq2_1.24.0               SummarizedExperiment_1.14.1
## [13] DelayedArray_0.10.0         BiocParallel_1.18.1        
## [15] matrixStats_0.54.0          Biobase_2.44.0             
## [17] GenomicRanges_1.36.0        GenomeInfoDb_1.20.0        
## [19] IRanges_2.18.2              S4Vectors_0.22.0           
## [21] BiocGenerics_0.30.0         biomaRt_2.40.4             
## [23] ggExtra_0.9                 GGally_1.4.0               
## [25] gridExtra_2.3               viridis_0.5.1              
## [27] viridisLite_0.3.0           RColorBrewer_1.1-2         
## [29] plotlyutils_0.0.0.9000      plotly_4.9.0               
## [31] glue_1.3.1                  reshape2_1.4.3             
## [33] forcats_0.4.0               stringr_1.4.0              
## [35] dplyr_0.8.2                 purrr_0.3.3                
## [37] readr_1.3.1                 tidyr_0.8.3                
## [39] tibble_2.1.3                ggplot2_3.2.0              
## [41] tidyverse_1.2.1            
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1           backports_1.1.4        Hmisc_4.2-0           
##   [4] plyr_1.8.4             lazyeval_0.2.2         splines_3.6.2         
##   [7] crosstalk_1.0.0        robust_0.4-18.1        digest_0.6.19         
##  [10] foreach_1.4.7          htmltools_0.3.6        GO.db_3.8.2           
##  [13] magrittr_1.5           checkmate_1.9.4        memoise_1.1.0         
##  [16] fit.models_0.5-14      cluster_2.1.0          doParallel_1.0.15     
##  [19] limma_3.40.6           annotate_1.62.0        modelr_0.1.5          
##  [22] prettyunits_1.0.2      colorspace_1.4-1       blob_1.2.0            
##  [25] rvest_0.3.4            rrcov_1.4-7            haven_2.1.1           
##  [28] xfun_0.8               crayon_1.3.4           RCurl_1.95-4.12       
##  [31] jsonlite_1.6           zeallot_0.1.0          impute_1.58.0         
##  [34] survival_3.1-8         iterators_1.0.12       gtable_0.3.0          
##  [37] zlibbioc_1.30.0        XVector_0.24.0         DEoptimR_1.0-8        
##  [40] scales_1.0.0           mvtnorm_1.0-11         DBI_1.0.0             
##  [43] miniUI_0.1.1.1         Rcpp_1.0.1             xtable_1.8-4          
##  [46] progress_1.2.2         htmlTable_1.13.1       foreign_0.8-74        
##  [49] bit_1.1-14             preprocessCore_1.46.0  Formula_1.2-3         
##  [52] htmlwidgets_1.3        httr_1.4.0             acepack_1.4.1         
##  [55] pkgconfig_2.0.2        reshape_0.8.8          XML_3.98-1.20         
##  [58] nnet_7.3-12            locfit_1.5-9.1         labeling_0.3          
##  [61] tidyselect_0.2.5       rlang_0.4.2            later_0.8.0           
##  [64] AnnotationDbi_1.46.1   munsell_0.5.0          cellranger_1.1.0      
##  [67] tools_3.6.2            cli_1.1.0              generics_0.0.2        
##  [70] RSQLite_2.1.2          broom_0.5.2            evaluate_0.14         
##  [73] yaml_2.2.0             bit64_0.9-7            robustbase_0.93-5     
##  [76] mime_0.7               xml2_1.2.2             compiler_3.6.2        
##  [79] rstudioapi_0.10        curl_3.3               affyio_1.54.0         
##  [82] geneplotter_1.62.0     pcaPP_1.9-73           stringi_1.4.3         
##  [85] lattice_0.20-38        Matrix_1.2-18          vctrs_0.2.0           
##  [88] pillar_1.4.2           BiocManager_1.30.4     data.table_1.12.2     
##  [91] bitops_1.0-6           httpuv_1.5.1           affy_1.62.0           
##  [94] R6_2.4.0               latticeExtra_0.6-28    promises_1.0.1        
##  [97] codetools_0.2-16       MASS_7.3-51.5          assertthat_0.2.1      
## [100] withr_2.1.2            GenomeInfoDbData_1.2.1 hms_0.5.1             
## [103] grid_3.6.2             rpart_4.1-15           rmarkdown_1.13        
## [106] shiny_1.3.2            lubridate_1.7.4        base64enc_0.1-3